Many Thanks to Albert van Breemen who created a great notebook, which I forked to use as a starting point for my own data exploration and file prep: https://www.kaggle.com/breemen/nyc-taxi-fare-data-exploration
## Load some default Python modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline
plt.style.use('seaborn-whitegrid')
## Read data in pandas dataframe, using nrows for exploration, then chunksize for later processing.
train_df = pd.read_csv('./datafiles/train.csv', nrows=200000, parse_dates=["pickup_datetime"]) ##TODO: change nrows back to 2m
## List first few rows (datapoints)
train_df.head()
## Check datatypes
train_df.dtypes
## Check statistics of the features
train_df.describe()
Is there any missing data? If so, how many rows are affected? Does the test set contain missing data?
Swapped Lat/Lon? Or just junk coordinates?
Load the test data to sanity check any changes to the train set.
## Read data in pandas dataframe
test_df = pd.read_csv('./datafiles/test.csv', parse_dates=["pickup_datetime"])
test_df.head(5)
test_df.describe()
def fix_fares(df, verbose=False):
    """Drop every record whose fare is below the NYC minimum flag-drop of $2.50.

    Returns the filtered frame; when verbose, reports how many rows were removed.
    """
    if verbose:
        print("Removing all fares less than $2.50:")
        size_before = len(df)
        print("Old size: {}".format(size_before))
    filtered = df[df.fare_amount >= 2.5]
    if verbose:
        size_after = len(filtered)
        print("New size: {}".format(size_after))
        n_dropped = size_before - size_after
        print("Dropped {} records, or {:.2f}%".format(n_dropped, (n_dropped / size_before) * 100))
    return filtered
train_df = fix_fares(train_df, True)
Next up, lets check to see how much missing data exists in this dataset and compare it to the test set. Then we'll drop any rows with missing data from the training set, since it'd be better to use less data than to let bad data influence the model.
print("Train Dataset NaNs: {}".format(train_df.isnull().sum()))
print("Test Dataset NaNs: {}".format(test_df.isnull().sum()))
def drop_nan(df, verbose=False):
    """Remove every row containing at least one NaN.

    Fixed vs. the original: dropna(inplace=True) mutated the caller's frame
    (and triggers SettingWithCopyWarning on slices). We now return a new
    filtered frame, matching the style of fix_fares().
    """
    if verbose:
        print("Dropping all rows with NaNs:")
        old_size = len(df)
        print("Old size: {}".format(old_size))
    # axis=0 is the canonical spelling of the old axis='rows' alias.
    df = df.dropna(how='any', axis=0)
    if verbose:
        new_size = len(df)
        print("New size: {}".format(new_size))
        difference = old_size - new_size
        percent = (difference / old_size) * 100
        print("Dropped {} records, or {:.2f}%".format(difference, percent))
    return df
train_df = drop_nan(train_df, True)
First, lets check to see if the test set has any outliers.
## Minimum and maximum longitude test set
min(test_df.pickup_longitude.min(), test_df.dropoff_longitude.min()), \
max(test_df.pickup_longitude.max(), test_df.dropoff_longitude.max())
## Minimum and maximum latitude test
min(test_df.pickup_latitude.min(), test_df.dropoff_latitude.min()), \
max(test_df.pickup_latitude.max(), test_df.dropoff_latitude.max())
def select_outside_boundingbox(df, BB):
    """Return rows whose pickup OR dropoff lies outside bounding box BB.

    BB is (lon_min, lon_max, lat_min, lat_max).
    """
    lon_min, lon_max, lat_min, lat_max = BB[0], BB[1], BB[2], BB[3]
    outside = (
        (df['pickup_longitude'] < lon_min) | (df['pickup_longitude'] > lon_max) |
        (df['pickup_latitude'] < lat_min) | (df['pickup_latitude'] > lat_max) |
        (df['dropoff_longitude'] < lon_min) | (df['dropoff_longitude'] > lon_max) |
        (df['dropoff_latitude'] < lat_min) | (df['dropoff_latitude'] > lat_max)
    )
    return df.loc[outside]
NYC_BB = (-74.5, -72.8, 40.5, 41.8)
latlon_outliers = select_outside_boundingbox(train_df, NYC_BB)
print("Train Dataset outliers: {}".format(latlon_outliers.sum()))
latlon_outliers
I'm noticing a lot of rows with at least one 0 in a column (Latitude, Longitude, and/or Passengers), which doesn't make much sense in the scope of this problem, so I'm going to drop all rows with a 0, as junk data.
def drop_0s(df, verbose=False):
    """Drop any row containing a zero in any column.

    In this dataset a 0 in latitude, longitude, or passenger_count is junk data.
    """
    if verbose:
        print("Dropping all rows with 0s:")
        size_before = len(df)
        print("Old size: {}".format(size_before))
    has_zero = (df == 0).any(axis=1)
    df = df.loc[~has_zero]
    if verbose:
        size_after = len(df)
        print("New size: {}".format(size_after))
        n_dropped = size_before - size_after
        print("Dropped {} records, or {:.2f}%".format(n_dropped, (n_dropped / size_before) * 100))
    return df
train_df = drop_0s(train_df, True)
train_df.describe()
latlon_outliers = drop_0s(latlon_outliers, True)
latlon_outliers.describe()
Now it looks like a significant portion of these might have useable data, just with the latitude and/or longitude accidentally swapped. Let's graph it out to see what we're dealing with.
import matplotlib.patches as patches
def plot(df, BB, s=10, alpha=0.2):
    """Scatter pickup/dropoff coordinates plus two 'inverted' views.

    The right-hand panels window the axes on the *swapped* ranges (longitude
    axis limited to BB's latitude span and vice versa) to expose records whose
    lat/lon were accidentally exchanged.

    Fixes vs. the original: panels 0/1 had their axis labels swapped, and two
    pairs of duplicated set_ylabel calls silently overwrote each other where a
    set_xlabel was intended.
    """
    fig, axs = plt.subplots(1, 4, figsize=(20, 5))
    axs[0].scatter(df.pickup_longitude, df.pickup_latitude, zorder=1, alpha=alpha, c='r', s=s)
    axs[0].set_xlabel('Longitude')
    axs[0].set_ylabel('Latitude')
    axs[0].set_title('Pickup locations')
    axs[1].scatter(df.dropoff_longitude, df.dropoff_latitude, zorder=1, alpha=alpha, c='r', s=s)
    axs[1].set_xlabel('Longitude')
    axs[1].set_ylabel('Latitude')
    axs[1].set_title('Dropoff locations')
    # Inverted panels: axis limits intentionally use the opposite coordinate's
    # range, so swapped records show up inside the window.
    axs[2].scatter(df.pickup_longitude, df.pickup_latitude, zorder=1, alpha=alpha, c='r', s=s)
    axs[2].set_xlim((BB[2], BB[3]))
    axs[2].set_xlabel('Latitude')
    axs[2].set_ylim((BB[0], BB[1]))
    axs[2].set_ylabel('Longitude')
    axs[2].set_title('Inverted Pickup locations')
    axs[3].scatter(df.dropoff_longitude, df.dropoff_latitude, zorder=1, alpha=alpha, c='r', s=s)
    axs[3].set_xlim((BB[2], BB[3]))
    axs[3].set_xlabel('Latitude')
    axs[3].set_ylim((BB[0], BB[1]))
    axs[3].set_ylabel('Longitude')
    axs[3].set_title('Inverted Dropoff locations')
plot(latlon_outliers, NYC_BB)
def select_within_boundingbox(df, BB):
    """Return rows whose pickup AND dropoff both lie inside bounding box BB.

    BB is (lon_min, lon_max, lat_min, lat_max); the box edges are inclusive.
    """
    lon_min, lon_max, lat_min, lat_max = BB[0], BB[1], BB[2], BB[3]
    inside = (
        df['pickup_longitude'].between(lon_min, lon_max) &
        df['pickup_latitude'].between(lat_min, lat_max) &
        df['dropoff_longitude'].between(lon_min, lon_max) &
        df['dropoff_latitude'].between(lat_min, lat_max)
    )
    return df.loc[inside]
inverted_NYC_BB = (40.5, 41.8, -74.5, -72.8)
inverted_outliers = select_within_boundingbox(latlon_outliers, inverted_NYC_BB)
inverted_outliers.describe()
The bounding box with inverted coordinates is located in Antarctica, so obviously that part of the data is invalid. However, there are 955 records here which have these inverted coordinates, and it's pretty easy to see how this might have happened: the latitude and longitude probably were accidentally swapped. And notice how all the datapoints within the swapped window of lat/lon all seem to be clustered together? It appears that perhaps a cab or a cab company was mistakenly configured to record the latitude in the longitude column and vice versa.
While 955 records out of 2 million is not even statistically meaningful, it's an easy enough task to convert this data back to a usable state and which likely is accurate to the original source. So I'll end up swapping these values and reincorporating them into the train dataframe.
def swap_inverted(df):
    """Swap latitude and longitude for rows whose coordinates were recorded in
    the wrong columns, preserving the caller's column order.

    Fixed vs. the original: column order was restored by swapping hard-coded
    positions 3..6, which silently breaks if the frame ever gains or loses a
    column. Reindexing by the input's own column labels is position-proof.
    """
    fixed_df = df.rename(columns={'pickup_longitude': 'pickup_latitude',
                                  'pickup_latitude': 'pickup_longitude',
                                  'dropoff_longitude': 'dropoff_latitude',
                                  'dropoff_latitude': 'dropoff_longitude'})
    # rename() relabels in place (the data stays where it was), so selecting
    # the original label order moves the swapped values into the right slots.
    return fixed_df[df.columns.tolist()]
fixed_outliers = swap_inverted(inverted_outliers)
fixed_outliers.describe()
## Now we'll remove all rows with a datapoint that doesn't fall within the bounding box for NYC coordinates
print("Old size: {}".format(len(train_df)))
train_df = select_within_boundingbox(train_df, NYC_BB)
print("New size: {}".format(len(train_df)))
train_df.describe()
train_df_copy = train_df # Created a copy so as to avoid the possibility of adding the fixed outliers multiple times
train_df = pd.concat([train_df_copy, fixed_outliers], ignore_index=True)
train_df_copy = None # Doing this to try to be a bit more memory efficient
train_df.describe()
Now that the investigation is over and a plan of action has been decided, I'm turning all of the transformation steps into a function, which will later be used on the whole dataset.
def latlon_outlier_fix(df, BB, verbose=False):
    """Full coordinate-cleaning pass: drop zero-filled rows, rescue lat/lon-swapped
    records, and keep only datapoints inside bounding box BB."""
    ## Dropping 0s also handles zeroed columns such as passenger_count.
    df = drop_0s(df, verbose)
    oob = select_outside_boundingbox(df, BB)
    if verbose:
        print("Dataset outliers: {}".format(len(oob)))
    ## A bounding box with its lon/lat ranges exchanged catches swapped records.
    swapped_BB = (BB[2], BB[3], BB[0], BB[1])
    swapped = select_within_boundingbox(oob, swapped_BB)
    if verbose:
        print("Inverted outliers: {}".format(len(swapped)))
    rescued = swap_inverted(swapped)
    if verbose:
        print("")
        size_before = len(df)
        print("Old size: {}".format(size_before))
    df = select_within_boundingbox(df, BB)
    if verbose:
        size_after = len(df)
        print("New size: {}".format(size_after))
        n_dropped = size_before - size_after
        print("Dropped {} records, or {:.2f}%".format(n_dropped, (n_dropped / size_before) * 100))
    ## Re-attach the repaired records.
    df = pd.concat([df, rescued])
    if verbose:
        size_final = len(df)
        print("Final size: {}".format(size_final))
        n_fixed = size_final - size_after
        print("Fixed {} records, or {:.2f}%".format(n_fixed, (n_fixed / size_after) * 100))
    return df
Since the model would just view the dates as a string, and at best just one-hot encode each second represented into a bucket (or at worst, since that would be next to impossible to compute on such a large dataset), let's bucketize each datetime into all its components ranging from hour of the day to the year. For this initial model, I'll leave these categorical values as-is, but later I'll perform feature selection and run feature crosses to distil this into only the most useful data.
## Before we begin, let's measure the memory impact that adding new columns might have
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a DataFrame or Series as 'X.XX MB'."""
    if isinstance(pandas_obj, pd.DataFrame):
        total_bytes = pandas_obj.memory_usage(deep=True).sum()
    else:
        # Anything that isn't a DataFrame is assumed to be a Series.
        total_bytes = pandas_obj.memory_usage(deep=True)
    return "{:03.2f} MB".format(total_bytes / 1024 ** 2)
print("Total Mem Usage: {}".format(mem_usage(train_df)))
def prepare_time_features(df, verbose=False, drop=False):
    """Expand pickup_datetime into bucketed time features (hour ... year).

    drop=True removes the original pickup_datetime column afterwards.

    Fixes vs. the original: the verbose branch measured the global train_df
    instead of the frame actually being processed, and .dt.week (removed in
    pandas 2.0) now has an isocalendar() fallback for newer pandas versions.
    """
    df["hour"] = df.pickup_datetime.dt.hour
    df["day_of_week"] = df.pickup_datetime.dt.weekday
    df["day_of_month"] = df.pickup_datetime.dt.day
    try:
        # pandas >= 1.1: .dt.week is deprecated/removed in favour of isocalendar().
        df["week"] = df.pickup_datetime.dt.isocalendar().week.astype('int64')
    except AttributeError:
        # pandas < 1.1: isocalendar() is not available on .dt
        df["week"] = df.pickup_datetime.dt.week
    df["month"] = df.pickup_datetime.dt.month
    df["year"] = df.pickup_datetime.dt.year - 2000  # Reducing to 2 digits for less memory usage
    if drop:
        df.drop(columns=['pickup_datetime'], inplace=True)
    if verbose:
        # was mem_usage(train_df): it reported the global frame, not `df`
        print("Total Mem Usage: {}".format(mem_usage(df)))
        display(df.head())
    return df
train_df = prepare_time_features(train_df, True, True)
test_df = prepare_time_features(test_df, False, True)
train_df.describe()
train_df.dtypes
print("Total Mem Usage: {}".format(mem_usage(train_df)))
## Exploration: how much memory can we save by downcasting numeric dtypes?
## First the integer columns: downcast to the smallest unsigned subtype that fits.
df_int = train_df.select_dtypes(include=['int64', 'int32', 'int16', 'int8', 'int'])
converted_int = df_int.apply(pd.to_numeric,downcast='unsigned')
print("Int Mem Usage: {}".format(mem_usage(df_int)))
print("New Int Mem Usage: {}".format(mem_usage(converted_int)))
## Show dtype counts before/after the downcast, side by side.
compare_ints = pd.concat([df_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)
## Apply the integer downcast to the real frame.
train_df[df_int.columns] = df_int.apply(pd.to_numeric, downcast='unsigned')
train_df.dtypes
## Same exercise for the float columns (float64 -> float32 where values allow).
df_float = train_df.select_dtypes(include=['float64', 'float32', 'float16', 'float'])
converted_float = df_float.apply(pd.to_numeric,downcast='float')
print("Float Mem Usage: {}".format(mem_usage(df_float)))
print("New Float Mem Usage: {}".format(mem_usage(converted_float)))
compare_floats = pd.concat([df_float.dtypes,converted_float.dtypes],axis=1)
compare_floats.columns = ['before','after']
compare_floats.apply(pd.Series.value_counts)
## Apply the float downcast to the real frame and re-measure total usage.
train_df[df_float.columns] = df_float.apply(pd.to_numeric,downcast='float')
print(train_df.dtypes)
print("Total Mem Usage: {}".format(mem_usage(train_df)))
With a noticeable memory reduction, it'll be useful to have this packaged into a function that can be used on the whole dataset.
def downcast(df, verbose=False):
    """Shrink numeric columns: ints to the smallest unsigned subtype, floats to
    float32 where the values allow. Mutates and returns the same frame."""
    if verbose:
        print("Original Mem Usage: {}".format(mem_usage(df)))
    conversions = (
        (['int64', 'int32', 'int16', 'int8', 'int'], 'unsigned'),
        (['float64', 'float32', 'float16', 'float'], 'float'),
    )
    for dtypes, target in conversions:
        subset = df.select_dtypes(include=dtypes)
        df[subset.columns] = subset.apply(pd.to_numeric, downcast=target)
    if verbose:
        print("Optimized Mem Usage: {}".format(mem_usage(df)))
    return df
Since we're dealing with location data, let's plot the coordinates on a map to get a better view of the data.
For this, Albert used the following websites:
Albert also defined a bounding box of interest by [long_min, long_max, latt_min, latt_max] using the minimum and maximum coordinates from the testset.
From Open Street Map Albert grabbed a map and dropped any datapoints outside that box.
nyc_map = plt.imread('https://aiblog.nl/download/nyc_-74.5_-72.8_40.5_41.8.png')
## load extra image to zoom in on NYC
NYC_BB_zoom = (-74.3, -73.7, 40.5, 40.9)
nyc_map_zoom = plt.imread('https://aiblog.nl/download/nyc_-74.3_-73.7_40.5_40.9.png')
## This function will be used more often to plot data on the NYC map
def plot_on_map(df, BB, nyc_map, s=10, alpha=0.2):
    """Scatter pickups (left panel) and dropoffs (right panel) over the NYC map
    image, both windowed to bounding box BB."""
    fig, axs = plt.subplots(1, 2, figsize=(16, 10))
    panels = (
        (axs[0], df.pickup_longitude, df.pickup_latitude, 'Pickup locations'),
        (axs[1], df.dropoff_longitude, df.dropoff_latitude, 'Dropoff locations'),
    )
    for ax, lons, lats, title in panels:
        ax.scatter(lons, lats, zorder=1, alpha=alpha, c='r', s=s)
        ax.set_xlim((BB[0], BB[1]))
        ax.set_ylim((BB[2], BB[3]))
        ax.set_title(title)
        ax.imshow(nyc_map, zorder=0, extent=BB)
## Plot training data on map
plot_on_map(train_df, NYC_BB, nyc_map, s=1, alpha=0.3)
## Plot training data on map zoomed in
plot_on_map(train_df, NYC_BB_zoom, nyc_map_zoom, s=1, alpha=0.3)
## Plot test data on map
plot_on_map(test_df, NYC_BB, nyc_map, alpha=1.0, s=20)
def plot_hires(df, BB, figsize=(12, 12), ax=None, c=('r', 'b')):
    """High-resolution scatter of pickups (colour c[0]) and dropoffs (c[1])
    inside bounding box BB; tiny dot sizes make the street grid visible.

    Fixed vs. the original: `ax == None` replaced with the identity check
    `ax is None` (PEP 8; == can be hijacked by operator overloading).
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize)
    subset = select_within_boundingbox(df, BB)
    ax.scatter(subset.pickup_longitude, subset.pickup_latitude, c=c[0], s=0.01, alpha=0.5)
    ax.scatter(subset.dropoff_longitude, subset.dropoff_latitude, c=c[1], s=0.01, alpha=0.5)
Another interesting way to visualize the data Albert learned from this kernel: https://www.kaggle.com/drgilermo/dynamics-of-new-york-city-animation. By using very small dot sizes the actual streets of New York become visible.
plot_hires(train_df, (-74.1, -73.7, 40.6, 40.9))
plot_hires(train_df, (-74, -73.95, 40.7, 40.8))
## Plot histogram of fare
train_df[train_df.fare_amount<100].fare_amount.hist(bins=100, figsize=(14,3))
plt.xlabel('fare $USD')
plt.title('Histogram');
The small spikes between \$40 and \$60 could indicate some fixed fare price (e.g. to/from airport). This will be explored further below.
These datapoints are clearly erroneous and I can't think of a way to fix them and get useful info from them, so we'll work to remove them. For this Albert used Photoshop to threshold on the blue color of the water and to clean up the map. The resulting map is shown below.
## Read nyc mask and turn into boolean map with land = True, water = False
nyc_mask = plt.imread('https://aiblog.nl/download/nyc_mask-74.5_-72.8_40.5_41.8.png')[:,:,0] > 0.9
plt.figure(figsize=(8,8))
plt.imshow(nyc_map, zorder=0)
plt.imshow(nyc_mask, zorder=1, alpha=0.7); # note: True is show in black, False in white.
Next, we convert longitude/latitude coordinates to xy pixel coordinates using lonlat_to_xy. Note that the y coordinate is reversed as the image y-axis goes from top to bottom. Then a boolean index is calculated using nyc_mask and we use a function to remove the offending datapoints.
def remove_datapoints_from_water(df, BB, nyc_mask, verbose=False):
    """Drop rows whose pickup or dropoff pixel falls on water in nyc_mask.

    nyc_mask is a boolean image (True = land) covering bounding box BB.

    Fixed vs. the original: a coordinate lying exactly on BB's upper edge
    mapped to a pixel index one past the end of the mask, raising the
    IndexError seen on some chunks. Pixel indices are now clipped into range.
    """
    def lonlat_to_xy(longitude, latitude, dx, dy, BB):
        # Map lon/lat to pixel column/row; the row axis is flipped because
        # image row 0 corresponds to the highest latitude.
        x = (dx * (longitude - BB[0]) / (BB[1] - BB[0])).astype('int')
        y = (dy - dy * (latitude - BB[2]) / (BB[3] - BB[2])).astype('int')
        # Clip so points exactly on the bounding-box edge stay in bounds.
        return np.clip(x, 0, dx - 1), np.clip(y, 0, dy - 1)
    if verbose:
        print("Removing all rows with a datapoint in the water:")
        old_size = len(df)
        print("Old size: {}".format(old_size))
    # calculate for each lon,lat coordinate the xy coordinate in the mask map
    pickup_x, pickup_y = lonlat_to_xy(df.pickup_longitude, df.pickup_latitude,
                                      nyc_mask.shape[1], nyc_mask.shape[0], BB)
    dropoff_x, dropoff_y = lonlat_to_xy(df.dropoff_longitude, df.dropoff_latitude,
                                        nyc_mask.shape[1], nyc_mask.shape[0], BB)
    # keep a row only when BOTH endpoints land on a True (land) pixel
    idx = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]
    df = df.loc[idx]
    if verbose:
        print("Number of trips in water: {}".format(np.sum(~idx)))
        new_size = len(df)
        print("New size: {}".format(new_size))
        difference = old_size - new_size
        percent = (difference / old_size) * 100
        print("Dropped {} records, or {:.2f}%".format(difference, percent))
    # return only datapoints on land
    return df
train_df = remove_datapoints_from_water(train_df, NYC_BB, nyc_mask, verbose=True)
Now let's see if all outliers in the water are still there.
# plot training data
plot_on_map(train_df, NYC_BB, nyc_map)
The above scatterplot gives a quick impression of the density, but it's more accurate to use actual counts of datapoints. Let's plot out the hotspots:
def distance(lat1, lon1, lat2, lon2):
    """
    Great-circle (haversine) distance in kilometers between two points
    given in decimal degrees. Works element-wise on arrays/Series.
    """
    ## Work in radians from here on
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    ## Haversine formula
    half_dlat = (lat2 - lat1) / 2
    half_dlon = (lon2 - lon1) / 2
    a = np.sin(half_dlat) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon) ** 2
    # 6371 km is the mean Earth radius; use 3956 for miles.
    return 6371 * (2 * np.arcsin(np.sqrt(a)))
def apply_distance(df, verbose=False):
    """Attach a distance_km column: haversine pickup-to-dropoff distance.

    Mutates and returns the same frame.
    """
    pickup = (df.pickup_latitude, df.pickup_longitude)
    dropoff = (df.dropoff_latitude, df.dropoff_longitude)
    df['distance_km'] = distance(pickup[0], pickup[1], dropoff[0], dropoff[1])
    if verbose:
        display(df.distance_km.describe())
    return df
## First calculate two arrays with datapoint density per sq km
n_lon, n_lat = 200, 200 # number of grid bins per longitude, latitude dimension
density_pickup, density_dropoff = np.zeros((n_lat, n_lon)), np.zeros((n_lat, n_lon)) # Prepare arrays
"""
To calculate the number of datapoints in a grid area, the numpy.digitize() function is used.
This function needs an array with the (location) bins for counting the number of datapoints per bin.
"""
bins_lon = np.zeros(n_lon+1) # bin edges along longitude
bins_lat = np.zeros(n_lat+1) # bin edges along latitude
delta_lon = (NYC_BB[1]-NYC_BB[0]) / n_lon # bin longitude width
delta_lat = (NYC_BB[3]-NYC_BB[2]) / n_lat # bin latitude height
# Physical bin sizes in km, measured with the haversine distance() helper
# along one edge of the bounding box.
bin_width_kms = distance(NYC_BB[2], NYC_BB[1], NYC_BB[2], NYC_BB[0]) / n_lon # Bin width in kms
bin_height_kms = distance(NYC_BB[3], NYC_BB[0], NYC_BB[2], NYC_BB[0]) / n_lat # Bin height in kms
# Evenly spaced bin edges spanning the bounding box.
for i in range(n_lon+1):
    bins_lon[i] = NYC_BB[0] + i * delta_lon
for j in range(n_lat+1):
    bins_lat[j] = NYC_BB[2] + j * delta_lat
## Digitize per longitude, latitude dimension
# np.digitize gives, for each coordinate, the 1-based index of the bin it falls in.
inds_pickup_lon = np.digitize(train_df.pickup_longitude, bins_lon)
inds_pickup_lat = np.digitize(train_df.pickup_latitude, bins_lat)
inds_dropoff_lon = np.digitize(train_df.dropoff_longitude, bins_lon)
inds_dropoff_lat = np.digitize(train_df.dropoff_latitude, bins_lat)
"""
Count per grid bin
note: as the density_pickup will be displayed as image, the first index is the y-direction,
the second index is the x-direction. Also, the y-direction needs to be reversed for
properly displaying (therefore the (n_lat-j) term)
"""
dxdy = bin_width_kms * bin_height_kms # area of one bin in sq km
for i in range(n_lon):
    for j in range(n_lat):
        density_pickup[j, i] = np.sum((inds_pickup_lon==i+1) & (inds_pickup_lat==(n_lat-j))) / dxdy
        density_dropoff[j, i] = np.sum((inds_dropoff_lon==i+1) & (inds_dropoff_lat==(n_lat-j))) / dxdy
## Plot the density arrays
fig, axs = plt.subplots(2, 1, figsize=(18, 24))
axs[0].imshow(nyc_map, zorder=0, extent=NYC_BB);
# log1p compresses the dynamic range so low-density areas remain visible.
im = axs[0].imshow(np.log1p(density_pickup), zorder=1, extent=NYC_BB, alpha=0.6, cmap='plasma')
axs[0].set_title('Pickup density [datapoints per sq km]')
cbar = fig.colorbar(im, ax=axs[0])
cbar.set_label('log(1 + # datapoints per sq km)', rotation=270)
axs[1].imshow(nyc_map, zorder=0, extent=NYC_BB);
im = axs[1].imshow(np.log1p(density_dropoff), zorder=1, extent=NYC_BB, alpha=0.6, cmap='plasma')
axs[1].set_title('Dropoff density [datapoints per sq km]')
cbar = fig.colorbar(im, ax=axs[1])
cbar.set_label('log(1 + # datapoints per sq km)', rotation=270)
The datapoints concentrate around Manhattan and the three airports (JFK, EWR, LGA), along with a hotspot near Seymour (upper right corner).
First, lets calculate the straight-line distance between points.
## Add new column to dataframe with distance in kms
train_df = apply_distance(train_df, True)
train_df.distance_km.hist(bins=50, figsize=(12,4))
plt.xlabel('distance km')
plt.title('Histogram ride distances in km')
It seems that most rides are just short rides, with a small peak at ~20 kms. This peak could be due to airport drives.
Let's also see if passenger_count changes anything.
train_df.groupby('passenger_count')['distance_km', 'fare_amount'].mean()
If the model can't beat a simple calculation of applying the average dollars per km, then the model won't be able to justify its computational cost. So, I'll use the average of \$3.40 USD/km (with anything under a total of $2.50 rounded up) as my baseline.
print("Average $USD/km : {:0.2f}".format(train_df.fare_amount.sum()/train_df.distance_km.sum()))
To confirm that more than just the distance goes into a taxi's fare, let's plot the two and if we don't get a straight line, we'll know that there's more that goes into it.
## Scatter plot distance - fare
fig, axs = plt.subplots(1, 2, figsize=(16,6))
axs[0].scatter(train_df.distance_km, train_df.fare_amount, alpha=0.2)
axs[0].set_xlabel('distance km')
axs[0].set_ylabel('fare $USD')
axs[0].set_title('All data')
## Zoom in on part of data
idx = (train_df.distance_km < 30) & (train_df.fare_amount < 100)
axs[1].scatter(train_df[idx].distance_km, train_df[idx].fare_amount, alpha=0.2)
axs[1].set_xlabel('distance km')
axs[1].set_ylabel('fare $USD')
axs[1].set_title('Zoom in on distance < 30 km, fare < $100');
Knowing all these factors we could theoretically hard code our own decision tree and try to recreate the specific algorithm that is inside each taxi meter; then we'd only have to predict the route the driver would choose to take and the travel time for that time of day. However, I believe that misses the point of machine learning and this competition, so I'm going to try to train my algorithm to pick up on all of these subtleties without having to spell everything out.
def split_distance(df, split_point=.1, verbose=False):
    """Partition df around a distance threshold (in km).

    Returns (short, long): rides strictly below split_point and rides at or
    above it, in that order.
    """
    if verbose:
        print("Removing all distances less than {} km:".format(split_point))
        size_before = len(df)
        print("Old size: {}".format(size_before))
    short = df.loc[df['distance_km'] < split_point]
    long = df.loc[df['distance_km'] >= split_point]
    if verbose:
        size_after = len(long)
        print("New size: {}".format(size_after))
        n_moved = size_before - size_after
        print("Separated out {} records, or {:.2f}%".format(n_moved, (n_moved / size_before) * 100))
    return short, long
short_train, train_df = split_distance(train_df, verbose=True)
train_df.describe()
short_train.describe()
short_train.loc[short_train['distance_km'] == 0.0].head()
test_df = apply_distance(test_df, True)
short_test, long_test = split_distance(test_df, verbose=True)
short_test.describe()
test_df.loc[test_df['distance_km'] == 0.0].head()
It's hard to imagine records for distances less than 0.1 km (328 feet or 100 meters; which is less than the length of most blocks) as being accurate data except in very rare edge cases. But only about 1% of the datapoints in the train and test set are affected by this discrepancy. They seem to happen at a relatively even distribution across a wide range of fares, locations, times and dates.
And since there seems to be no apparent rhyme or reason to these unreasonably short distances, their existence would probably only confuse the model, which is why I've removed them. Even though the test set also contains unreasonably short distances, I don't think there's a good way to accurately learn from the ones in the train set, and so the model will have to just guess at those based on other related features. With the RMSE metric, I don't think this 1% of datapoints will throw off the accuracy too drastically.
While we're questioning the validity of the test data, lets check it for datapoints in the water.
remove_datapoints_from_water(test_df, NYC_BB, nyc_mask, verbose=True)
len(test_df)
And sure enough, there are a couple of datapoints in the water, but that's not too bad.
It's not uncommon for taxis to charge fixed rates for trips to/from an airport. Let's check and see if that's a factor in this dataset.
## JFK airport coordinates, see https://www.travelmath.com/airport/JFK
jfk = (-73.7822222222, 40.6441666667)
nyc = (-74.0063889, 40.7141667)
def plot_location_fare(loc, name, range=1.5):
    """Histogram fares for trips starting (left) / ending (right) within
    `range` km of loc, where loc is (longitude, latitude).

    Reads the global train_df. Note: the `range` parameter shadows the
    builtin; the name is kept for call compatibility.
    """
    fig, axs = plt.subplots(1, 2, figsize=(14, 5))
    panels = (
        ('pickup', train_df.pickup_latitude, train_df.pickup_longitude, axs[0]),
        ('dropoff', train_df.dropoff_latitude, train_df.dropoff_longitude, axs[1]),
    )
    for label, lats, lons, ax in panels:
        ## Select all datapoints within `range` km of the location
        near = (distance(lats, lons, loc[1], loc[0]) < range)
        train_df[near].fare_amount.hist(bins=100, ax=ax)
        ax.set_xlabel('fare $USD')
        ax.set_title('Histogram {} location within {} kms of {}'.format(label, range, name))
plot_location_fare(jfk, 'JFK Airport')
It definitely looks like trips to and from an airport run at a fixed total rate.
Let's also check the other two big airports.
ewr = (-74.175, 40.69) # Newark Liberty International Airport, see https://www.travelmath.com/airport/EWR
lgr = (-73.87, 40.77) # LaGuardia Airport, see https://www.travelmath.com/airport/LGA
plot_location_fare(ewr, 'Newark Airport')
plot_location_fare(lgr, 'LaGuardia Airport')
While I could certainly anticipate these special cases, and create a complex filter to catch all the fixed fare airport rides, then train with extra columns. This would be not too dissimilar to skipping the model completely for these datapoints and hard coding in some rules for any rides to or from a certain distance from an airport. However, that neglects the whole point of machine learning, which is to avoid hard coding in complex rules. Furthermore, I'm using feature crosses on the lat/lon data so the model will likely be able to notice these patterns on its own and hopefully predict the flat rates without prompting.
I think the preparation of the data I've done so far has been sufficient to set this model up for success, so now I just need to apply it to all the data and feed it to the model.
def split_by_hash(df, train_percent=80):
    """
    Split the dataset deterministically into (train, validate) frames using a
    stable hash of the key column.

    train_percent (int) is the percent of the dataset routed to the train
    frame, with the remaining percent going to validate.
    example: train_percent=75 sends roughly 25% of the data to validate.

    Fixed vs. the original: the builtin hash() of a string is salted per
    process (PYTHONHASHSEED), so the old split was NOT repeatable across runs;
    an md5-based bucket is stable everywhere. The input frame is also no
    longer mutated (the old version left a hashed_result column in the
    caller's df and used .drop(inplace=True) on slices, which warns).
    """
    import hashlib

    def _bucket(key):
        # Stable 0-99 bucket derived from the md5 digest of the key.
        return int(hashlib.md5(str(key).encode('utf-8')).hexdigest(), 16) % 100

    buckets = df.key.apply(_bucket)
    train = df[buckets < train_percent].copy()
    validate = df[buckets >= train_percent].copy()
    return train, validate
The below function takes around 5 hours to run in full.
Note: I was able to run this smoothly until chunk 28, which ran into an IndexError in the remove_datapoints_from_water function, so I saved a copy of the first 27 chunks as a smaller dataset (if 27 million rows is considered small) and tried again. It encountered the error again on the same index 1282, but I couldn't find anything out of the ordinary. So in the interest of time, I try/except skipped that error, because it shouldn't hurt things too badly if a few datapoints in the water get missed. I also added the if count check so that I could just start appending to the previous file from where it left off.
def prepare_train_valid(file, BB, verbose=False, chunksize=1000000, train_percent=80, start_chunk=0):
    """
    Stream the raw CSV through the full cleaning pipeline, appending the
    results to taxi-train.csv / taxi-valid.csv.

    start_chunk lets a crashed run resume: chunks numbered <= start_chunk are
    read but skipped. (This generalizes the hard-coded `if count > 27` resume
    hack from the original; the default of 0 processes every chunk.)

    NOTE: If rerunning from scratch, make sure to delete the previous
    train/valid files first — output is appended, not overwritten.
    """
    count = 0
    train_file = "taxi-train.csv"
    validation_file = "taxi-valid.csv"
    for df in pd.read_csv(file, chunksize=chunksize, parse_dates=["pickup_datetime"]):
        count += 1
        print("Processing chunk: {} \r".format(count),)
        if count > start_chunk:
            df = fix_fares(df, verbose)
            df = drop_nan(df, verbose)
            df = drop_0s(df, verbose)
            df = latlon_outlier_fix(df, BB, verbose)
            df = prepare_time_features(df, verbose, drop=True)
            try:
                # nyc_mask is tied to the NYC_BB extent, hence the global here.
                df = remove_datapoints_from_water(df, NYC_BB, nyc_mask, verbose)
            except IndexError:
                # A few in-water points slipping through one chunk is acceptable.
                print("Ignored Index Error")
            df = apply_distance(df, verbose)
            short_train, df = split_distance(df, verbose)
            # df = downcast(df, verbose) # Not using due to possible issues with dtypes in TF
            train_df, validate_df = split_by_hash(df, train_percent)
            train_df.to_csv(train_file, index=False, header=False, mode='a')
            validate_df.to_csv(validation_file, index=False, header=False, mode='a')
prepare_train_valid("./datafiles/train.csv", NYC_BB)
...with compatible formatting if needed, but all of the non-destructive formatting was already done above during data exploration.
# test_df = downcast(test_df, True)
test_df.to_csv("taxi-test.csv", index=False)
keyless_test_df = test_df.drop(columns="key")
keyless_test_df.to_csv("keyless-taxi-test.csv", index=False)
This is for doing batch prediction on Google Cloud ML Engine.
test_df.to_json("taxi_test_unformatted.json", orient = "records", date_format = "epoch",
double_precision = 10, force_ascii = True, date_unit = "ms", default_handler = None)
keyless_test_df.to_json("keyless_taxi_test_unformatted.json", orient = "records", date_format = "epoch",
double_precision = 10, force_ascii = True, date_unit = "ms", default_handler = None)
import json
## Convert the single JSON array into newline-delimited JSON (one record per
## line), the layout batch prediction expects.
## Fixed vs. the original: file handles are now closed via context managers,
## and the output opens with "w" instead of "a+" so rerunning the cell does
## not append duplicate rows.
with open('keyless_taxi_test_unformatted.json', 'r') as src:
    records = json.load(src)
with open('keyless_taxi_test.json', 'w') as dst:
    for record in records:
        dst.write(json.dumps(record) + "\n")
import json
## Same newline-delimited conversion for the keyed test file.
## Fixed vs. the original: context managers close the handles, and "w"
## replaces "a+" so reruns don't append duplicate rows.
with open('taxi_test_unformatted.json', 'r') as src:
    records = json.load(src)
with open('taxi_test.json', 'w') as dst:
    for record in records:
        dst.write(json.dumps(record) + "\n")
import json
## Load the two batch-prediction output files (newline-delimited JSON) into a
## single DataFrame.
## Fixed vs. the original: rows are collected first and concatenated once —
## pd.concat inside the loop is quadratic — and the files are opened with
## context managers so they are always closed.
## NOTE(review): file 2 is read before file 1, matching the original order —
## presumably so the rows line up with test_df; confirm before reuse.
prediction_rows = []
with open("./datafiles/taxifare_prediction2.json", 'r') as prediction_json:
    for row in prediction_json:
        prediction_rows.append(pd.DataFrame.from_dict(json.loads(row)))
with open("./datafiles/taxifare_prediction1.json", 'r') as prediction_json:
    for row in prediction_json:
        prediction_rows.append(pd.DataFrame.from_dict(json.loads(row)))
prediction_df = pd.concat(prediction_rows, ignore_index=True) if prediction_rows else pd.DataFrame()
prediction_df.describe()
## Build the Kaggle submission file from the test frame plus predicted fares.
submission_df = test_df  # NOTE(review): this aliases test_df (no copy) — the next line also mutates test_df; use test_df.copy() if that matters
submission_df['fare_amount'] = prediction_df  # assumes prediction_df rows align 1:1 positionally with the test rows — TODO confirm ordering
submission_df.head()
submission_df.to_csv('taxi-predict-v2.1.csv', columns=['key', 'fare_amount'], index=False)